# Substring signatures found in the User-Agent field of known web crawlers.
# A log record containing ANY of these strings is treated as bot traffic
# and removed during cleaning.
bots = [
    "(compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "(compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "(compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
    "(compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "(compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)",
    "(compatible; Neevabot/1.0; +https://neeva.com/neevabot)",
    "(compatible; DataForSeoBot/1.0; +https://dataforseo.com/dataforseo-bot)"
]
# Keyword signatures carried by common crawlers (bot traffic filter list).

# Clean one access-log file: drop every record matching a crawler signature
# from `bots`, report counts, and write the surviving records to
# "<date>result.log" in the same directory.
# NOTE: the input filename is adjusted by hand for each log file to process.
with open("fcode-abox_workview1-www.w3cschool.cn-2021-10-16.log", "r") as log:
    lines = log.readlines()
    # Total record count before cleaning.
    print("记录条数{:d}".format(len(lines)))

    # Data cleaning: keep only records that contain none of the bot signatures.
    # BUGFIX: the previous version marked matches as "000\n" and then
    # deduplicated via set(lines), which also collapsed legitimate duplicate
    # records, destroyed record order, and raised ValueError on
    # lines.remove("000\n") when no bot matched at all. Filtering directly
    # avoids all three problems.
    lines = [line for line in lines if not any(bot in line for bot in bots)]

    # The client IP is the text before the first "-" in each record.
    iplist = [line.split("-", 1)[0] for line in lines]
    # Number of DISTINCT IPs.
    # BUGFIX: previously printed len(iplist), which counts duplicates.
    print("不重复的ip：{:d}".format(len(set(iplist))))
    # Record count remaining after the crawler records were removed.
    print("剩余记录条数{:d}".format(len(lines)))

    # Result filename: the date slice of the log name + "result.log"
    # (written to the same directory as the input).
    result = log.name[-14:-4] + "result.log"

# Write the cleaned records in one pass.
# BUGFIX: the old code reopened the result file in append mode once per line
# (and called f.close() inside the `with`); append mode also duplicated
# output across reruns. Mode "w" makes the run idempotent.
with open(result, "w", encoding="UTF-8") as f:
    f.writelines(lines)
